#!/bin/bash
#
# This script downloads all the pages from Mr Money Mustache's blog archive
# and extracts the post content's HTML. Then, if Calibre is available on the
# system, it converts the result into a nice and easy epub file for viewing
# in your favorite ebook reader.
#
# Requirements besides the shell built-ins and common distro programs are:
#
#   - pup (https://github.com/ericchiang/pup): HTML parser written in Go
#   - calibre: optional; required only to create epub files.
#   - wget: for downloading the article pages
#   - curl: for downloading index file
#
# Warning: running this may put a large load on Mr Money Mustache's servers,
# which could be wrongly identified as a DDoS attack. Be polite and refrain
# from running it too often (see the throttling note after the download loop).
#
# TODO: implement a way to download only updates, so you don't have to
#       re-download all ~500 pages every time a single article comes out.
#       (A sketch of one approach follows the download step below.)
#
#    Copyright 2018 - kzimmermann - https://quitter.se/kzimmermann
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

if ! command -v pup > /dev/null
then
    echo "This script requires pup to work, which we couldn't find here."
    echo "Please download the latest version before proceeding."
    exit 1
fi
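
# wget and curl are needed too (see the requirements above); apply the same
# check to both:

for tool in wget curl
do
    if ! command -v "$tool" > /dev/null
    then
        echo "This script requires $tool to work, which we couldn't find here."
        echo "Please install it before proceeding."
        exit 1
    fi
done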

article_list="http://www.mrmoneymustache.com/all-the-posts-since-the-beginning-of-time/"

# Parse article list to produce an "index" from which we will download pages:

cd /tmp || exit 1

echo "Getting MMM's index page..."
curl -s "$article_list" | grep "https://www.mrmoneymustache.com/20" > links_raw

number=$(wc -l < /tmp/links_raw)
echo "There are $number articles to be downloaded."

# We need just the raw links.
# The site's HTTPS certificate is broken, so GnuTLS will complain if the
# links are left as they are. Turn them into plain HTTP instead.
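# Each captured line is assumed to look roughly like this:
#
#   <li><a href="https://www.mrmoneymustache.com/2011/04/06/some-post/">Some Post</a></li>
#
# and after the sed below it becomes:
#
#   http://www.mrmoneymustache.com/2011/04/06/some-post/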

sed -e 's/<li><a href="https/http/g' -e 's/">.*//g' links_raw > links_raw2

# Sort it oldest-first so it reads more like a book. The URLs embed the
# publication date, so a plain lexicographic sort is already chronological.

sort links_raw2 > links_raw

# Create a neat directory for tidiness, and start downloading pages.
# This can take a loooong time (about 500 articles!)

mkdir -p /tmp/mmmbook
cd /tmp/mmmbook || exit 1

echo "Downloading article pages. Please be patient, as it can take a long time."

counter=1
while read -r line
do
    echo "Downloading article $counter of $number..."
    # Save to a zero-padded filename so the glob below sorts the articles
    # chronologically (wget's default names like index.html.10 would sort
    # before index.html.2).
    wget -O "$(printf '%04d.html' "$counter")" "$line" 2> /dev/null
    counter=$((counter + 1))
done < /tmp/links_raw
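
# To keep the load on the server gentle (see the warning in the header), you
# could throttle the loop above by adding a short pause just before the
# 'done', e.g.:
#
#   sleep 1    # one-second pause between article downloads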

echo "Download completed."

# With all articles downloaded, it's time to trim the garbage (headers,
# comments, etc.) and join the post bodies into one long, readable HTML
# document. pup does the heavy lifting here.

echo "Composing HTML document..."

for article in [0-9]*.html
do
    pup 'div[class~=post_box]' < "$article" >> mmm_book.html
done
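
# The result is a bare sequence of <div> fragments. If your reader is strict
# about well-formedness, you could wrap it in a minimal skeleton (a sketch;
# ebook-convert usually copes with fragments as-is):
#
#   { echo '<html><head><meta charset="utf-8"><title>MMM Archive</title></head><body>'
#     cat mmm_book.html
#     echo '</body></html>'; } > mmm_book_full.html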

# At this point, you are good to go. However, if you would like to turn it
# into an ebook, install Calibre and the step below will convert it to epub.

if command -v ebook-convert > /dev/null
then
    echo "Converting into EPUB. This can take a while..."
    ebook-convert mmm_book.html mmm_book.epub &> /dev/null
    echo "Conversion complete. Your ebook is available here:"
    echo "/tmp/mmmbook/mmm_book.epub"
else
    echo "Calibre is not available in this system. Cannot convert to epub."
    echo "Your book (in html format) is available here:"
    echo "/tmp/mmmbook/mmm_book.html"
fi
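
# ebook-convert also accepts metadata flags if you want a nicer library entry;
# a sketch of the same conversion with a title and author set:
#
#   ebook-convert mmm_book.html mmm_book.epub \
#       --title "Mr Money Mustache: All the Posts" \
#       --authors "Mr Money Mustache" &> /dev/null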

exit 0
